Maximum likelihood (MLE) and method of moments (MM) are two common methods for constructing a model. For this assignment, we are going to model the unknown distributions with maximum likelihood and method of moments.
library(tidyverse)
require(dplyr)
library(stats4)
Hmisc::getHdata(nhgh)
d1 <- nhgh %>%
filter(sex == "female") %>%
filter(age >= 18) %>%
select(gh, ht) %>%
filter(1:n()<=1000)
In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.
gh <- d1$gh
fun <- function(a, b)
sum(-log(dnorm(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 6, b = 1))
## Warning in dnorm(gh, a, b): NaNs produced
## Warning in dnorm(gh, a, b): NaNs produced
## Warning in dnorm(gh, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
## a
## 5.724595
b
## b
## 1.051692
Then, we are going to overlay the estimated pdf onto histogram.
hist(gh, breaks = 20, freq = FALSE)
curve(dnorm(x, a, b), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- gh
q_candidate <- qnorm
x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qnorm(0.5, a, b)
## [1] 5.724595
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rnorm(1e4, a, b)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.05184657
In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.
gh <- d1$gh
fun <- function(a, b)
sum(-log(dgamma(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 40, b = 7))
## Warning in dgamma(gh, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
## a
## 40.81883
b
## b
## 7.130414
Then, we are going to overlay the estimated pdf onto histogram.
hist(gh, breaks = 20, freq = FALSE)
curve(dgamma(x, a, b), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- gh
q_candidate <- qgamma
x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qgamma(0.5, a, b)
## [1] 5.677929
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rgamma(1e4, a, b)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.04385128
In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.
gh <- d1$gh
fun <- function(a, b)
sum(-log(dweibull(gh, a, b)))
z <- mle(minuslogl = fun, start = list(a = 4, b = 6))
a <- coef(z)[1]
b <- coef(z)[2]
a
## a
## 4.125254
b
## b
## 6.173884
Then, we are going to overlay the estimated pdf onto histogram.
hist(gh, breaks = 20, freq = FALSE)
curve(dweibull(x, a, b), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- gh
q_candidate <- qweibull
x <- q_candidate((1:200)/200, a, b)
y <- quantile(gh, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qweibull(0.5, a, b)
## [1] 5.64902
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rweibull(1e4, a, b)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.07738035
In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.
gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
mu <- xb
sigma <- sqrt(s2)
mu
## [1] 5.7246
sigma
## [1] 1.052246
Then, we are going to overlay the estimated pdf onto histogram.
hist(gh, breaks = 20, freq = FALSE)
curve(dnorm(x, mu, sigma), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, mu, sigma), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- gh
q_candidate <- qnorm
x <- q_candidate((1:200)/200, mu, sigma)
y <- quantile(gh, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qnorm(0.5, mu, sigma)
## [1] 5.7246
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rnorm(1e4, mu, sigma)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.05142749
In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.
gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
lh <- xb/s2
ch <- xb^2/s2
lh
## [1] 5.170237
ch
## [1] 29.59754
Then, we are going to overlay the estimated pdf onto histogram.
hist(gh, breaks = 20, freq = FALSE)
curve(dgamma(x, ch, lh), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, ch, lh), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- gh
q_candidate <- qgamma
x <- q_candidate((1:200)/200, ch, lh)
y <- quantile(gh, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qgamma(0.5, ch, lh)
## [1] 5.660259
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rgamma(1e4, ch, lh)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.05112776
In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.
gh <- d1$gh
xb <- mean(gh)
s2 <- var(gh)
cv2 = s2/xb^2
fun <- function(beta, cv2){
abs(gamma(1 + 2/beta)/(gamma(1 + 1/beta))^2 - (1 + cv2))
}
k = optimize(fun, interval = c(0, 100), cv2 = cv2, maximum = FALSE)$minimum
lambda = xb/gamma(1 + 1/k)
k
## [1] 6.353198
lambda
## [1] 6.151307
Then, we are going to overlay the estimated pdf onto histogram.
hist(gh, breaks = 20, freq = FALSE)
curve(dweibull(x, k, lambda), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(gh), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, k, lambda), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- gh
q_candidate <- qweibull
x <- q_candidate((1:200)/200, k, lambda)
y <- quantile(gh, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qweibull(0.5, k, lambda)
## [1] 5.806483
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rweibull(1e4, k, lambda)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.05162193
In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.
ht <- d1$ht
fun <- function(a, b)
sum(-log(dnorm(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 160, b = 10))
## Warning in dnorm(ht, a, b): NaNs produced
a <- coef(z)[1]
b <- coef(z)[2]
a
## a
## 160.7412
b
## b
## 7.315558
Then, we are going to overlay the estimated pdf onto histogram.
hist(ht, breaks = 20, freq = FALSE)
curve(dnorm(x, a, b), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- ht
q_candidate <- qnorm
x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qnorm(0.5, a, b)
## [1] 160.7412
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rnorm(1e4, a, b)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.3591797
In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.
ht <- d1$ht
fun <- function(a, b)
sum(-log(dgamma(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 450, b = 2))
a <- coef(z)[1]
b <- coef(z)[2]
a
## a
## 480.4147
b
## b
## 2.988733
Then, we are going to overlay the estimated pdf onto histogram.
hist(ht, breaks = 20, freq = FALSE)
curve(dgamma(x, a, b), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- ht
q_candidate <- qgamma
x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qgamma(0.5, a, b)
## [1] 160.6304
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rgamma(1e4, a, b)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.361636
In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.
ht <- d1$ht
fun <- function(a, b)
sum(-log(dweibull(ht, a, b)))
z <- mle(minuslogl = fun, start = list(a = 20, b = 160))
a <- coef(z)[1]
b <- coef(z)[2]
a
## a
## 21.85402
b
## b
## 164.2472
Then, we are going to overlay the estimated pdf onto histogram.
hist(ht, breaks = 20, freq = FALSE)
curve(dweibull(x, a, b), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, a, b), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- ht
q_candidate <- qweibull
x <- q_candidate((1:200)/200, a, b)
y <- quantile(ht, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qweibull(0.5, a, b)
## [1] 161.5156
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rweibull(1e4, a, b)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.4170276
In this part, we assume that the Glycohemoglobin data is sampled from the normal distribution, and we plan to calculate the parameters first.
ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
mu <- xb
sigma <- sqrt(s2)
mu
## [1] 160.7419
sigma
## [1] 7.320161
Then, we are going to overlay the estimated pdf onto histogram.
hist(ht, breaks = 20, freq = FALSE)
curve(dnorm(x, mu, sigma), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pnorm(x, mu, sigma), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- ht
q_candidate <- qnorm
x <- q_candidate((1:200)/200, mu, sigma)
y <- quantile(ht, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qnorm(0.5, mu, sigma)
## [1] 160.7419
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rnorm(1e4, mu, sigma)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.3602376
In this part, we assume that the Glycohemoglobin data is sampled from the Gamma distribution, and we plan to calculate the parameters first.
ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
lh <- xb/s2
ch <- xb^2/s2
lh
## [1] 2.999769
ch
## [1] 482.1886
Then, we are going to overlay the estimated pdf onto histogram.
hist(ht, breaks = 20, freq = FALSE)
curve(dgamma(x, ch, lh), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pgamma(x, ch, lh), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- ht
q_candidate <- qgamma
x <- q_candidate((1:200)/200, ch, lh)
y <- quantile(ht, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x, y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qgamma(0.5, ch, lh)
## [1] 160.6308
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rgamma(1e4, ch, lh)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.3603169
In this part, we assume that the Glycohemoglobin data is sampled from the Weibull distribution, and we plan to calculate the parameters first.
ht <- d1$ht
xb <- mean(ht)
s2 <- var(ht)
cv2 = s2/xb^2
fun <- function(beta, cv2){
abs(gamma(1 + 2/beta)/(gamma(1 + 1/beta))^2 - (1 + cv2))
}
k = optimize(fun, interval = c(0, 100), cv2 = cv2, maximum = FALSE)$minimum
lambda = xb/gamma(1 + 1/k)
k
## [1] 27.45941
lambda
## [1] 163.9807
Then, we are going to overlay the estimated pdf onto histogram.
hist(ht, breaks = 20, freq = FALSE)
curve(dweibull(x, k, lambda), add = TRUE)
Overlay the estimated cdf onto the ecdf
plot(ecdf(ht), cex = 0.1, col = "red", main = "", ylab = "Density")
curve(pweibull(x, k, lambda), add = TRUE)
legend('bottomright', legend = c("estimated cdf", "ecdf"), col = c("black", "red"), lty = 1, lwd = 2)
QQ plot
random_sample <- ht
q_candidate <- qweibull
x <- q_candidate((1:200)/200, k, lambda)
y <- quantile(ht, probs = (1:200)/200)
tgsify::plotstyle(style = upright)
plot(x,y, asp = 1, xlab = "Estimated quantile", ylab = "Sample quantile")
abline(0,1)
Estimated median
qweibull(0.5, k, lambda)
## [1] 161.8065
Median sampling distribution
median <- c()
for (i in 1:1e5) {
data <- rweibull(1e4, k, lambda)
median <- c(median, median(data))
}
hist(median, breaks = 20, freq = FALSE)
Range of middle 95% of sampling distribution
range = quantile(median, c(0.025, 0.975))
range[2] - range[1]
## 97.5%
## 0.3318572